# KORA_S3S4 Data Analysis - Minimum Model
import math
import numpy as np
import statistics
import pandas as pd
import scipy.stats
import seaborn as sns
import os
import pandas_profiling as pp
from matplotlib import pyplot as plt
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import impute
import pandas_profiling as pp
import nbimporter
import Modeling
from IPython.core.debugger import set_trace
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.inspection import permutation_importance
import statsmodels.api as sts
%matplotlib inline
# Load the preprocessed KORA S3/S4 dataset (missing values already imputed — "median" file).
KORA_Noise_noMissing = pd.read_csv('C:\\Users\\sahar.behzadi\\Desktop\\Noise2Nako\\Data\\KORA_S3_S4\\KORA_Noise_noMissing_median.csv')

# Feature matrices:
#   X      — every covariate except the blood-pressure outcome columns
#   X_mini — the reduced "minimum model" feature set used below
X = KORA_Noise_noMissing.drop(columns=['hyper_p', 'bp_diast', 'bp_syst'])
X_mini = KORA_Noise_noMissing.loc[:, ['sex', 'age', 'bmi', 'smoking', 'lden_org']]

print('Data description \n')
print('Sex: Female = 0, Male = 1 \n Smoking: Current = 1, Ex-smoker = 2, Never smoker =3 \n ')

# Outcomes: hypertension indicator plus systolic / diastolic blood pressure.
Y_hyper = KORA_Noise_noMissing['hyper_p'].astype(int)
Y_SBP = KORA_Noise_noMissing['bp_syst']
Y_DBP = KORA_Noise_noMissing['bp_diast']

Dis: We need to find a good reference (root) point at which to perform the expansion.
Adv: Can be applied to any model!
Dis: The explanations are very local; they do not measure global impacts.
Adv: Fast — no optimization is required.
Intuition: Every layer in a NN, for instance, is a composition of simpler functions (e.g., ReLU).

# SHAP (SHapley Additive exPlanations) analysis of the minimum model.
# NOTE(review): shap and xgboost are third-party packages imported mid-file,
# not in the import block at the top — consider moving them there.
import shap
import xgboost
# Load the SHAP JavaScript visualization library (needed for interactive
# force plots in a notebook context).
shap.initjs()
# Fit a gradient-boosted tree regressor on the minimum feature set,
# predicting systolic blood pressure.
model = xgboost.XGBRegressor().fit(X_mini, Y_SBP)
SHAP assigns each feature an importance value for a particular prediction.
For complex models, such as ensemble methods or deep networks, we cannot use the original model as its own best explanation because it is not easy to understand. Instead, we must use a simpler explanation model, which we define as any interpretable approximation of the original model. A surprising attribute of the class of additive feature attribution methods is that there is a single unique solution in this class with three desirable properties (described below).
# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
explainer = shap.Explainer(model)
# Compute SHAP values for every row of the minimum feature set.
shap_values = explainer(X_mini)
# visualize the first prediction's explanation
shap.plots.waterfall(shap_values[0])
shap.plots.force(shap_values[0])
# ...and the second prediction's explanation
shap.plots.waterfall(shap_values[1])
shap.plots.force(shap_values[1])
print(X_mini.head(5))
print(shap_values[1])
# Per-feature "coefficient" for sample 1: SHAP value divided element-wise by
# the raw feature value.
# NOTE(review): this divides by .data, which is 0 for some features in other
# rows (e.g. sex = 0) — would yield inf/nan there; confirm this interpretation
# is intended and only applied to rows with non-zero features.
print('The coefficients : ', shap_values[1].values / shap_values[1].data)
sex age bmi smoking lden_org
0 0.0 31.0 18.94 2.0 41.0
1 1.0 40.0 27.14 2.0 55.2
2 1.0 59.0 30.34 3.0 55.2
3 0.0 62.0 19.46 1.0 46.8
4 0.0 62.0 31.25 2.0 51.2
.values =
array([ 4.4161687, -6.436804 , 1.4671109, -1.3191336, 1.0423521],
dtype=float32)
.base_values =
130.54442
.data =
array([ 1. , 40. , 27.14, 2. , 55.2 ])
The coefficients : [ 4.41616869 -0.1609201 0.05405714 -0.65956682 0.01888319]
# Third prediction's explanation.
shap.plots.waterfall(shap_values[2])
shap.plots.force(shap_values[2])
observations = X_mini.to_numpy()
# Re-initialize the JS visualization library for this cell.
shap.initjs()
# Global (stacked) force plot over the entire dataset.
# NOTE(review): this re-computes SHAP values via the legacy
# explainer.shap_values(...) API instead of reusing `shap_values`, and is slow
# for thousands of rows (hence the sampling in the next cell).
shap.force_plot(explainer.expected_value, explainer.shap_values(X_mini), feature_names=X_mini.columns)
shap.plots.force is slow for many thousands of rows, try subsampling your data.
# since the algorithm is somehow too slow for the original data we consider a sample and the following plots are for
# a sample data including 100 samples
# random_state fixed for reproducibility of the subsample.
observations = X_mini.sample(100, random_state=42).to_numpy()
shap.initjs()
# Stacked force plot for the 100-row subsample; `features` supplies the raw
# feature values shown on hover.
shap.force_plot(explainer.expected_value, explainer.shap_values(observations), features=observations,
feature_names=X_mini.columns)
shap.initjs()
# Beeswarm-style summary of SHAP values (global feature importance):
# for the sample
shap.summary_plot(explainer.shap_values(observations), features=observations, feature_names=X_mini.columns)
# for the entire dataset
shap.summary_plot(explainer.shap_values(X_mini), features=X_mini, feature_names=X_mini.columns)
# create a dependence scatter plot to show the effect of a single feature across the whole dataset
# Noise exposure (lden_org), colored by age to reveal interaction effects.
shap.plots.scatter(shap_values[:,"lden_org"], dot_size=2, x_jitter=0.5, color = shap_values[:,"age"])
# Age, colored by noise exposure.
shap.plots.scatter(shap_values[:,"age"], color=shap_values[:,"lden_org"])
# Passing the full Explanation as `color` lets SHAP pick the strongest
# interacting feature automatically.
shap.plots.scatter(shap_values[:,"bmi"], color=shap_values)
shap.plots.scatter(shap_values[:,"age"], color=shap_values)
Passing parameters norm and vmin/vmax simultaneously is deprecated since 3.3 and will become an error two minor releases later. Please pass vmin/vmax directly to the norm when creating it.
# Categorical features: jitter spreads out the discrete x-values so
# overlapping points remain visible.
shap.plots.scatter(shap_values[:,"sex"], dot_size=2, x_jitter=0.5, color=shap_values)
shap.plots.scatter(shap_values[:,"smoking"], dot_size=2, x_jitter=0.5, color=shap_values)